{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b2545b69",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run -i \"../util/file_utils.ipynb\"\n",
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "478b5275",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "leaf leaf\n",
      "leaves leave\n",
      "booking book\n",
      "writing write\n",
      "completed complete\n",
      "stemming stem\n"
     ]
    }
   ],
   "source": [
    "# Lemmatize a few isolated words: each word goes through the pipeline on its own\n",
    "# and every resulting token is printed next to its lemma.\n",
    "words = [\"leaf\", \"leaves\", \"booking\", \"writing\", \"completed\", \"stemming\"]\n",
    "docs = list(map(small_model, words))\n",
    "tokens = (tok for parsed in docs for tok in parsed)\n",
    "for tok in tokens:\n",
    "    print(tok, tok.lemma_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "7ddd1f33",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "To to\n",
      "Sherlock Sherlock\n",
      "Holmes Holmes\n",
      "she she\n",
      "is be\n",
      "always always\n",
      "_ _\n",
      "the the\n",
      "_ _\n",
      "woman woman\n",
      ". .\n",
      "I I\n",
      "have have\n",
      "seldom seldom\n",
      "heard hear\n",
      "him he\n",
      "\n",
      " \n",
      "\n",
      "mention mention\n",
      "her she\n",
      "under under\n",
      "any any\n",
      "other other\n",
      "name name\n",
      ". .\n",
      "In in\n",
      "his his\n",
      "eyes eye\n",
      "she she\n",
      "eclipses eclipse\n",
      "and and\n",
      "\n",
      " \n",
      "\n",
      "predominates predominate\n",
      "the the\n",
      "whole whole\n",
      "of of\n",
      "her her\n",
      "sex sex\n",
      ". .\n",
      "It it\n",
      "was be\n",
      "not not\n",
      "that that\n",
      "he he\n",
      "felt feel\n",
      "any any\n",
      "emotion emotion\n",
      "\n",
      " \n",
      "\n",
      "akin akin\n",
      "to to\n",
      "love love\n",
      "for for\n",
      "Irene Irene\n",
      "Adler Adler\n",
      ". .\n",
      "All all\n",
      "emotions emotion\n",
      ", ,\n",
      "and and\n",
      "that that\n",
      "one one\n",
      "particularly particularly\n",
      ", ,\n",
      "\n",
      " \n",
      "\n",
      "were be\n",
      "abhorrent abhorrent\n",
      "to to\n",
      "his his\n",
      "cold cold\n",
      ", ,\n",
      "precise precise\n",
      "but but\n",
      "admirably admirably\n",
      "balanced balanced\n",
      "mind mind\n",
      ". .\n",
      "He he\n",
      "\n",
      " \n",
      "\n",
      "was be\n",
      ", ,\n",
      "I I\n",
      "take take\n",
      "it it\n",
      ", ,\n",
      "the the\n",
      "most most\n",
      "perfect perfect\n",
      "reasoning reasoning\n",
      "and and\n",
      "observing observe\n",
      "machine machine\n",
      "that that\n",
      "\n",
      " \n",
      "\n",
      "the the\n",
      "world world\n",
      "has have\n",
      "seen see\n",
      ", ,\n",
      "but but\n",
      "as as\n",
      "a a\n",
      "lover lover\n",
      "he he\n",
      "would would\n",
      "have have\n",
      "placed place\n",
      "himself himself\n",
      "in in\n",
      "a a\n",
      "\n",
      " \n",
      "\n",
      "false false\n",
      "position position\n",
      ". .\n",
      "He he\n",
      "never never\n",
      "spoke speak\n",
      "of of\n",
      "the the\n",
      "softer soft\n",
      "passions passion\n",
      ", ,\n",
      "save save\n",
      "with with\n",
      "a a\n",
      "gibe gibe\n",
      "\n",
      " \n",
      "\n",
      "and and\n",
      "a a\n",
      "sneer sneer\n",
      ". .\n",
      "They they\n",
      "were be\n",
      "admirable admirable\n",
      "things thing\n",
      "for for\n",
      "the the\n",
      "observer observer\n",
      "— —\n",
      "excellent excellent\n",
      "for for\n",
      "\n",
      " \n",
      "\n",
      "drawing draw\n",
      "the the\n",
      "veil veil\n",
      "from from\n",
      "men man\n",
      "’s ’s\n",
      "motives motive\n",
      "and and\n",
      "actions action\n",
      ". .\n",
      "But but\n",
      "for for\n",
      "the the\n",
      "trained train\n",
      "\n",
      " \n",
      "\n",
      "reasoner reasoner\n",
      "to to\n",
      "admit admit\n",
      "such such\n",
      "intrusions intrusion\n",
      "into into\n",
      "his his\n",
      "own own\n",
      "delicate delicate\n",
      "and and\n",
      "finely finely\n",
      "\n",
      " \n",
      "\n",
      "adjusted adjust\n",
      "temperament temperament\n",
      "was be\n",
      "to to\n",
      "introduce introduce\n",
      "a a\n",
      "distracting distract\n",
      "factor factor\n",
      "which which\n",
      "might might\n",
      "\n",
      " \n",
      "\n",
      "throw throw\n",
      "a a\n",
      "doubt doubt\n",
      "upon upon\n",
      "all all\n",
      "his his\n",
      "mental mental\n",
      "results result\n",
      ". .\n",
      "Grit grit\n",
      "in in\n",
      "a a\n",
      "sensitive sensitive\n",
      "\n",
      " \n",
      "\n",
      "instrument instrument\n",
      ", ,\n",
      "or or\n",
      "a a\n",
      "crack crack\n",
      "in in\n",
      "one one\n",
      "of of\n",
      "his his\n",
      "own own\n",
      "high high\n",
      "- -\n",
      "power power\n",
      "lenses lense\n",
      ", ,\n",
      "would would\n",
      "not not\n",
      "\n",
      " \n",
      "\n",
      "be be\n",
      "more more\n",
      "disturbing disturbing\n",
      "than than\n",
      "a a\n",
      "strong strong\n",
      "emotion emotion\n",
      "in in\n",
      "a a\n",
      "nature nature\n",
      "such such\n",
      "as as\n",
      "his his\n",
      ". .\n",
      "And and\n",
      "\n",
      " \n",
      "\n",
      "yet yet\n",
      "there there\n",
      "was be\n",
      "but but\n",
      "one one\n",
      "woman woman\n",
      "to to\n",
      "him he\n",
      ", ,\n",
      "and and\n",
      "that that\n",
      "woman woman\n",
      "was be\n",
      "the the\n",
      "late late\n",
      "Irene Irene\n",
      "\n",
      " \n",
      "\n",
      "Adler Adler\n",
      ", ,\n",
      "of of\n",
      "dubious dubious\n",
      "and and\n",
      "questionable questionable\n",
      "memory memory\n",
      ". .\n"
     ]
    }
   ],
   "source": [
    "# Lemmatize a longer passage: parse the Sherlock Holmes excerpt and list every token\n",
    "# next to its lemma. Whitespace tokens show up in the listing too (see the saved output).\n",
    "text = read_text_file(\"../data/sherlock_holmes_1.txt\")\n",
    "doc = small_model(text)\n",
    "for surface, lemma in ((tok.text, tok.lemma_) for tok in doc):\n",
    "    print(surface, lemma)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0a1eacf7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Look the lemmatizer component up by name via the pipeline API.\n",
    "# get_pipe raises KeyError in this cell when no such component exists, instead of\n",
    "# leaving `lemmatizer` as None and failing later on the first method call.\n",
    "lemmatizer = small_model.get_pipe(\"lemmatizer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "21c7d0c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "To is in its base form: False\n",
      "Sherlock is in its base form: False\n",
      "Holmes is in its base form: False\n",
      "she is in its base form: False\n",
      "is is in its base form: False\n",
      "always is in its base form: False\n",
      "_ is in its base form: False\n",
      "the is in its base form: False\n",
      "_ is in its base form: False\n",
      "woman is in its base form: True\n",
      ". is in its base form: False\n",
      "I is in its base form: False\n",
      "have is in its base form: False\n",
      "seldom is in its base form: False\n",
      "heard is in its base form: False\n",
      "him is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "mention is in its base form: True\n",
      "her is in its base form: False\n",
      "under is in its base form: False\n",
      "any is in its base form: False\n",
      "other is in its base form: True\n",
      "name is in its base form: True\n",
      ". is in its base form: False\n",
      "In is in its base form: False\n",
      "his is in its base form: False\n",
      "eyes is in its base form: False\n",
      "she is in its base form: False\n",
      "eclipses is in its base form: False\n",
      "and is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "predominates is in its base form: False\n",
      "the is in its base form: False\n",
      "whole is in its base form: True\n",
      "of is in its base form: False\n",
      "her is in its base form: False\n",
      "sex is in its base form: True\n",
      ". is in its base form: False\n",
      "It is in its base form: False\n",
      "was is in its base form: False\n",
      "not is in its base form: False\n",
      "that is in its base form: False\n",
      "he is in its base form: False\n",
      "felt is in its base form: False\n",
      "any is in its base form: False\n",
      "emotion is in its base form: True\n",
      "\n",
      " is in its base form: False\n",
      "akin is in its base form: True\n",
      "to is in its base form: False\n",
      "love is in its base form: True\n",
      "for is in its base form: False\n",
      "Irene is in its base form: False\n",
      "Adler is in its base form: False\n",
      ". is in its base form: False\n",
      "All is in its base form: False\n",
      "emotions is in its base form: False\n",
      ", is in its base form: False\n",
      "and is in its base form: False\n",
      "that is in its base form: False\n",
      "one is in its base form: False\n",
      "particularly is in its base form: False\n",
      ", is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "were is in its base form: False\n",
      "abhorrent is in its base form: True\n",
      "to is in its base form: False\n",
      "his is in its base form: False\n",
      "cold is in its base form: True\n",
      ", is in its base form: False\n",
      "precise is in its base form: True\n",
      "but is in its base form: False\n",
      "admirably is in its base form: False\n",
      "balanced is in its base form: True\n",
      "mind is in its base form: True\n",
      ". is in its base form: False\n",
      "He is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "was is in its base form: False\n",
      ", is in its base form: False\n",
      "I is in its base form: False\n",
      "take is in its base form: True\n",
      "it is in its base form: False\n",
      ", is in its base form: False\n",
      "the is in its base form: False\n",
      "most is in its base form: False\n",
      "perfect is in its base form: True\n",
      "reasoning is in its base form: True\n",
      "and is in its base form: False\n",
      "observing is in its base form: False\n",
      "machine is in its base form: True\n",
      "that is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "the is in its base form: False\n",
      "world is in its base form: True\n",
      "has is in its base form: False\n",
      "seen is in its base form: False\n",
      ", is in its base form: False\n",
      "but is in its base form: False\n",
      "as is in its base form: False\n",
      "a is in its base form: False\n",
      "lover is in its base form: True\n",
      "he is in its base form: False\n",
      "would is in its base form: False\n",
      "have is in its base form: True\n",
      "placed is in its base form: False\n",
      "himself is in its base form: False\n",
      "in is in its base form: False\n",
      "a is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "false is in its base form: True\n",
      "position is in its base form: True\n",
      ". is in its base form: False\n",
      "He is in its base form: False\n",
      "never is in its base form: False\n",
      "spoke is in its base form: False\n",
      "of is in its base form: False\n",
      "the is in its base form: False\n",
      "softer is in its base form: False\n",
      "passions is in its base form: False\n",
      ", is in its base form: False\n",
      "save is in its base form: True\n",
      "with is in its base form: False\n",
      "a is in its base form: False\n",
      "gibe is in its base form: True\n",
      "\n",
      " is in its base form: False\n",
      "and is in its base form: False\n",
      "a is in its base form: False\n",
      "sneer is in its base form: True\n",
      ". is in its base form: False\n",
      "They is in its base form: False\n",
      "were is in its base form: False\n",
      "admirable is in its base form: True\n",
      "things is in its base form: False\n",
      "for is in its base form: False\n",
      "the is in its base form: False\n",
      "observer is in its base form: True\n",
      "— is in its base form: False\n",
      "excellent is in its base form: True\n",
      "for is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "drawing is in its base form: False\n",
      "the is in its base form: False\n",
      "veil is in its base form: True\n",
      "from is in its base form: False\n",
      "men is in its base form: False\n",
      "’s is in its base form: False\n",
      "motives is in its base form: False\n",
      "and is in its base form: False\n",
      "actions is in its base form: False\n",
      ". is in its base form: False\n",
      "But is in its base form: False\n",
      "for is in its base form: False\n",
      "the is in its base form: False\n",
      "trained is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "reasoner is in its base form: True\n",
      "to is in its base form: False\n",
      "admit is in its base form: True\n",
      "such is in its base form: True\n",
      "intrusions is in its base form: False\n",
      "into is in its base form: False\n",
      "his is in its base form: False\n",
      "own is in its base form: True\n",
      "delicate is in its base form: True\n",
      "and is in its base form: False\n",
      "finely is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "adjusted is in its base form: False\n",
      "temperament is in its base form: True\n",
      "was is in its base form: False\n",
      "to is in its base form: False\n",
      "introduce is in its base form: True\n",
      "a is in its base form: False\n",
      "distracting is in its base form: False\n",
      "factor is in its base form: True\n",
      "which is in its base form: False\n",
      "might is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "throw is in its base form: True\n",
      "a is in its base form: False\n",
      "doubt is in its base form: True\n",
      "upon is in its base form: False\n",
      "all is in its base form: False\n",
      "his is in its base form: False\n",
      "mental is in its base form: True\n",
      "results is in its base form: False\n",
      ". is in its base form: False\n",
      "Grit is in its base form: True\n",
      "in is in its base form: False\n",
      "a is in its base form: False\n",
      "sensitive is in its base form: True\n",
      "\n",
      " is in its base form: False\n",
      "instrument is in its base form: True\n",
      ", is in its base form: False\n",
      "or is in its base form: False\n",
      "a is in its base form: False\n",
      "crack is in its base form: True\n",
      "in is in its base form: False\n",
      "one is in its base form: False\n",
      "of is in its base form: False\n",
      "his is in its base form: False\n",
      "own is in its base form: True\n",
      "high is in its base form: True\n",
      "- is in its base form: False\n",
      "power is in its base form: True\n",
      "lenses is in its base form: False\n",
      ", is in its base form: False\n",
      "would is in its base form: False\n",
      "not is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "be is in its base form: True\n",
      "more is in its base form: False\n",
      "disturbing is in its base form: True\n",
      "than is in its base form: False\n",
      "a is in its base form: False\n",
      "strong is in its base form: True\n",
      "emotion is in its base form: True\n",
      "in is in its base form: False\n",
      "a is in its base form: False\n",
      "nature is in its base form: True\n",
      "such is in its base form: True\n",
      "as is in its base form: False\n",
      "his is in its base form: False\n",
      ". is in its base form: False\n",
      "And is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "yet is in its base form: False\n",
      "there is in its base form: False\n",
      "was is in its base form: False\n",
      "but is in its base form: False\n",
      "one is in its base form: False\n",
      "woman is in its base form: True\n",
      "to is in its base form: False\n",
      "him is in its base form: False\n",
      ", is in its base form: False\n",
      "and is in its base form: False\n",
      "that is in its base form: False\n",
      "woman is in its base form: True\n",
      "was is in its base form: False\n",
      "the is in its base form: False\n",
      "late is in its base form: True\n",
      "Irene is in its base form: False\n",
      "\n",
      " is in its base form: False\n",
      "Adler is in its base form: False\n",
      ", is in its base form: False\n",
      "of is in its base form: False\n",
      "dubious is in its base form: True\n",
      "and is in its base form: False\n",
      "questionable is in its base form: True\n",
      "memory is in its base form: True\n",
      ". is in its base form: False\n"
     ]
    }
   ],
   "source": [
    "# Ask the lemmatizer, token by token, whether the surface form is already a base form.\n",
    "for tok in doc:\n",
    "    already_base = lemmatizer.is_base_form(tok)\n",
    "    print(f\"{tok} is in its base form: {already_base}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d75ce84a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Names of the components in the loaded pipeline.\n",
    "[component_name for component_name, _ in small_model.pipeline]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b44e268d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
